# Split a data frame into random training and testing sets.
#
# Args:
#   data:         data frame to split
#   seed:         RNG seed, fixed so the split is reproducible
#   trainPercent: proportion of rows assigned to training (default 0.8)
#
# Returns: a list with `training` and `testing` data frames.
createTraining <- function(data, seed = 123, trainPercent = 0.8) {
  set.seed(seed)
  n <- nrow(data)
  numTrain <- floor(trainPercent * n)
  # seq_len(n) is safe when n == 0, unlike 1:n (which yields c(1, 0))
  trainingRows <- sample(seq_len(n), size = numTrain, replace = FALSE)
  trainingData <- data[trainingRows, ]
  # setdiff() instead of -trainingRows: negative indexing with an empty
  # vector (numTrain == 0) would wrongly drop every row from testing
  testingData <- data[setdiff(seq_len(n), trainingRows), ]
  list(training = trainingData, testing = testingData)
}
# Augment `data` with confidence and prediction intervals from a fitted
# linear model. Adds columns: fit, confLwr, confUpr, predictLwr, predictUpr.
#
# Uses the predict() generic rather than calling the predict.lm method
# directly (calling S3 methods directly is an anti-pattern), and base-R
# renaming so this helper no longer needs dplyr.
jjIntervals <- function(data, model) {
  confidence <- as.data.frame(predict(model, newdata = data, interval = "confidence"))
  names(confidence) <- c("fit", "confLwr", "confUpr")
  prediction <- as.data.frame(predict(model, newdata = data, interval = "prediction"))
  # keep only the prediction bounds; `fit` already comes from `confidence`
  prediction <- prediction[, c("lwr", "upr")]
  names(prediction) <- c("predictLwr", "predictUpr")
  cbind(data, confidence, prediction)
}
# Density plot of {{x}} filled by {{fill}}, with a rug along the x-axis
# colored by {{color}}. The legend is hidden; callers add labels afterwards.
jjplotDensity <- function(data, x, fill, color) {
  base <- ggplot(data, aes(x = {{x}}))
  base +
    geom_density(aes(fill = {{fill}}), alpha = 0.4) +
    geom_rug(aes(color = {{color}}), y = 0) +
    theme_custom() +
    theme(legend.position = "none")
}
# Horizontal boxplot of {{y}} grouped by {{x}} and filled by {{fill}}.
# coord_flip() lays the boxes out horizontally; the legend is hidden.
jjplotBoxplot <- function(data, x, y, fill) {
  ggplot(data = data, aes(x = {{x}}, y = {{y}}, fill = {{fill}})) +
    geom_boxplot() +
    coord_flip() +
    theme_custom() +
    theme(legend.position = "none")
}
# Scatter plot of {{x}} vs {{y}} with the model fit line, confidence band
# (yellow ribbon + black dashed lines) and prediction band (red dashed
# lines). Interval columns come from jjIntervals(); the 10^ transforms
# back-convert bounds computed on a log10 response scale.
# Layers are added in the original order so the drawing stacks identically.
jjplotPoint <- function(data, x, y, color, model) {
  data <- jjIntervals(data, model)
  p <- ggplot(data = data, aes(x = {{x}}, y = {{y}}, color = {{color}}))
  p <- p + geom_point()
  p <- p + geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4)
  p <- p + geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75)
  p <- p + geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75)
  p <- p + geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75)
  p <- p + geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75)
  p <- p + geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75)
  p + theme_custom()
}
# Summarize a 2x2 confusion matrix (rows = actual, columns = predicted).
# Returns the misclassification rate and a p-value from a normal
# approximation comparing that error rate against the observed positive
# rate (pHat). Note: the `matrix` parameter shadows base::matrix locally.
checkModel <- function(data, matrix) {
  n <- nrow(data)
  # off-diagonal cells are the misclassified observations
  misclassified <- matrix[1, 2] + matrix[2, 1]
  # second row holds the actual positives
  positives <- matrix[2, 1] + matrix[2, 2]
  error <- misclassified / n
  pHat <- positives / n
  se <- sqrt(pHat * (1 - pHat) / n)
  list(error = error, pValue = pnorm(error, pHat, se))
}
# Append response-scale predictions from a fitted glm to `data` as a new
# `recidPredict` column (probabilities, for a binomial model).
getPredict <- function(data, model) {
  # use the predict() generic instead of calling predict.glm directly
  recidPredict <- predict(model, newdata = data, type = "response")
  cbind(data, recidPredict)
}
# Root-mean-square error between predictions and observations.
# Pairs with an NA difference are dropped from the mean (na.rm = TRUE).
RMSE <- function(predict, obs) {
  squaredErrors <- (predict - obs)^2
  sqrt(mean(squaredErrors, na.rm = TRUE))
}
# Data loading ---- (paths are relative to the document's working directory)
# full data set (4000-row sample, per the file name)
recid <- read.csv("datasets/Project3Sample4000.csv")
# mystery project data — scored later in the "Mystery Data Matching" section
recidMystery <- read.csv("datasets/Project3Mystery100.csv")
# male / female split samples (1500 rows each, per the file names)
recidMale <- read.csv("datasets/Project3Males1500.csv")
recidFemale <- read.csv("datasets/Project3Females1500.csv")
Recidivism is a term used within the criminal justice system which means “the tendency of a criminal to reoffend after serving a sentence in a disciplinary institution.” The data we will be analyzing is from Broward County, Florida and includes recidivism predictions from two proprietary tests that are given to inmates. These predictions are in the form of decile scores which correspond to an inmate's recidivation likelihood and violence level. The goal of this analysis is to, based on various factors of an inmate, predict whether or not they will reoffend within two years of being released, as well as to support or debunk the tests from those proprietary organizations used to classify this.
By the end of this analysis, we hope to have an accurate classification model for whether or not a person is likely to reoffend as well as have the ability to discuss the accuracy of the model in detail.
Finally, we hope to understand the ethical implications of the model we make and to know how to mitigate and/or measure the biases held by the model itself.
This task revolves around visualizing the data and making the data we are given usable. We clean the full data set found in the Project3Sample4000.csv file. This includes data cleaning, feature engineering, and data refining along with the creation of a testing training split.
## Data Cleaning
# Clean the full sample: rename snake_case / proprietary columns to
# camelCase, coerce dates and categoricals, and drop rows missing the
# outcome or both jail dates.
recid2 <- recid %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
riskRecidDecileScore = RiskRecidDecileScore,
riskRecidScoreLevel = RiskRecidScoreLevel,
riskRecidScreeningDate = RiskRecidScreeningDate,
riskViolenceDecileScore = RiskViolenceDecileScore,
riskViolenceScoreLevel = RiskViolenceScoreLevel
) %>%
mutate(
dob = as_date(dmy(dob)),
# explicit level order so age categories sort youngest -> oldest
ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
# NOTE(review): any race value not exactly matching these lowercase levels
# silently becomes NA — confirm the CSV uses this spelling/case
race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
# strip any parentheses from the charge-degree codes before factoring
chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
# labelled factor version of the 0/1 outcome for plots
recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
) %>%
select(-name, -dob) %>%
# keep only rows with a known outcome and at least one jail date
filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))
## Data Engineering
# Feature engineering: jail duration, log10 transforms (the +0.1 offsets
# avoid log10(0)), charge-description length, and combined juvenile counts.
recid3 <- recid2 %>%
mutate(
# +1 so a same-day in/out stay counts as one day
daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount + 0.1),
# str_count() with the default pattern — presumably the character length
# of the description; TODO confirm intended behavior
chargeDescCount = str_count(chargeDesc),
logChargeDescCount = log10(chargeDescCount+0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount + 0.1)
)
# Category Removal: drop race so it cannot enter any predictive model
recid4 <- recid3 %>% select(-race)
## Testing Training Split for Logistic Models
# Split for the logistic (classification) models; seed fixed for
# reproducibility. Built from recid4, which has race removed.
testingTraining <- createTraining(recid4, seed=8675309)
recidTraining <- testingTraining$training
recidTesting <- testingTraining$testing
## Testing Training Split for Multiple Regression Models
# Separate split (different seed) built from recid3, which keeps all columns.
testingTraining2 <- createTraining(recid3, seed = 859)
recidTraining2 <- testingTraining2$training
recidTesting2 <- testingTraining2$testing
# Mystery Data Matching
# Apply the same renaming/cleaning as recid2 so the fitted models can score
# the mystery cases. Presumably this file lacks the outcome and score-level
# columns (fewer renames/mutations here) — verify against the CSV header.
recidMystery2 <- recidMystery %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
# NOTE(review): the trailing comma below is tolerated by tidyverse (rlang)
# dots handling but would be an error in most base R calls
riskRecidScreeningDate = RiskRecidScreeningDate,
) %>%
mutate(
dob = as_date(dmy(dob)),
ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
) %>%
select(-dob) %>%
# no isRecid filter here — the mystery data has no outcome to filter on
filter((!is.na(jailIn) | !is.na(jailOut)))
# Engineer the same model features for the mystery data (matching recid3,
# minus the charge-description features).
recidMystery3 <- recidMystery2 %>%
mutate(
daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount + 0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount + 0.1)
)
# Male Vs. Female Data and Cleaning
# Same cleaning as recid2, applied to the male-only sample.
recidMale2 <- recidMale %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
riskRecidDecileScore = RiskRecidDecileScore,
riskRecidScoreLevel = RiskRecidScoreLevel,
riskRecidScreeningDate = RiskRecidScreeningDate,
riskViolenceDecileScore = RiskViolenceDecileScore,
riskViolenceScoreLevel = RiskViolenceScoreLevel
) %>%
mutate(
dob = as_date(dmy(dob)),
ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
) %>%
select(-name, -dob) %>%
filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))
# Same cleaning as recid2, applied to the female-only sample.
recidFemale2 <- recidFemale %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
riskRecidDecileScore = RiskRecidDecileScore,
riskRecidScoreLevel = RiskRecidScoreLevel,
riskRecidScreeningDate = RiskRecidScreeningDate,
riskViolenceDecileScore = RiskViolenceDecileScore,
riskViolenceScoreLevel = RiskViolenceScoreLevel
) %>%
mutate(
dob = as_date(dmy(dob)),
ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
) %>%
select(-name, -dob) %>%
filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))
# Engineer modeling features for the male and female samples (matching the
# recid3 features, minus the charge-description counts).
recidMale3 <- recidMale2 %>%
mutate(
daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount + 0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount + 0.1)
)
recidFemale3 <- recidFemale2 %>%
mutate(
daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount + 0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount + 0.1)
)
# Testing/training splits for the sex-specific samples (same seed so both
# splits are constructed the same way).
maleTestingTraining <- createTraining(recidMale3, seed = 123)
maleTraining <- maleTestingTraining$training
maleTesting <- maleTestingTraining$testing
femaleTestingTraining <- createTraining(recidFemale3, seed = 123)
femaleTraining <- femaleTestingTraining$training
femaleTesting <- femaleTestingTraining$testing
In our data cleaning, we forced date variables to be stored as dates and factor variables to be factors. Then we engineered a few categories in the data - some for convenience and some for purpose. These include the following: daysInJail (difference between entry and exit of jail), logDaysInJail (log base 10 of daysInJail), logPriorsCount (log base 10 of priorsCount), juvCount (total of all juvenile crime categories), and logJuvCount (log base 10 of juvCount). We then remove name, dob, and race because name is irrelevant to recidivism, dob is covered by the included age category, and race is not fair to include in a predictive model as there is no definitive difference aside from visually between two people of different races.
In this section we will be creating four different models based on the data set given and now refined. Our first two models will be logistic regression models to predict whether or not an inmate will reoffend within two years of their release, both including and excluding the proprietary test scores included in the data. Our third model will predict the risk of recidivism score. This score was given by a written test given to prisoners and was computed by a third-party company that uses a black-box algorithm to compute the value of riskRecidDecileLevel. Our fourth model will predict the violence score of an inmate (riskViolenceDecileScore). This was once more calculated from a test administered to inmates and was computed by a third-party company using a black-box algorithm.
This is our first model. It is a logistic regression (classification) model which predicts whether or not an inmate will reoffend within two years of being released. The following plots are some data visualization relevant to this model.
### DaysInJail Plot
# Figure 1 components: densities (p1, p2) and boxplots (p3, p4) of days in
# jail and its log10 transform, grouped by the recidivism outcome.
p1 <- recidTraining %>%
jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Days in Jail",
x = "Days in Jail"
)
p2 <- recidTraining %>%
jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Log10 of Days in Jail",
x = "log10(daysInJail)"
)
p3 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=daysInJail, fill=as.factor(isRecid)) +
labs(
title="Days in Jail",
y = "Days in Jail",
x = "Reoffence Prediction Proportion"
)
p4 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logDaysInJail, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
labs(
title="Log Base 10 of Days in Jail",
y = "log10(daysInJail)",
x = "Reoffence Prediction Proportion",
fill = "Reoffence Prediction Proportion"
)
# patchwork: arrange the four plots with a shared title and collected legend
p1 + p2 + p3 + p4 +
plot_annotation(
title = "Days in Jail and log10(Days in Jail)",
theme=theme_custom()
) + plot_layout(guides = 'collect')
Figure 1 illustrates that a higher proportion of inmates who spent less time in jail reoffended compared to the proportion of inmates who spent a longer duration in jail. This is shown more clearly in the right two plots, as the left two plots are so heavily skewed that they are not very readable. The left two plots are included to demonstrate that taking the log base 10 of daysInJail eliminates much of the skew, therefore making it a more sensitive predictor to include within a model than simply daysInJail.
### Priors Count
# Figure 2 components: priors count and its log10 transform (densities and
# boxplots), grouped by the recidivism outcome.
p5 <- recidTraining %>%
jjplotDensity(x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Priors Counts",
x = "Priors Counts"
)
p6 <- recidTraining %>%
jjplotDensity(x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Priors Counts + 0.1)",
x = "log10(Priors Counts + 0.1)"
)
p7 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=priorsCount, fill=as.factor(isRecid)) +
labs(
title="Priors Counts",
y = "Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p8 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logPriorsCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
# re-enable the legend on this panel only; patchwork collects it below
theme(legend.position = "right") +
labs(
title="log10(Priors Counts + 0.1)",
y = "log10(Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p5 + p6 + p7 + p8 + plot_annotation(title = "Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 2 illustrates that taking the log base 10 of the number of prior offenses (plus 0.1 to avoid taking the log10 of 0) improves the sensitivity of the predictions made with that variable as well as reducing the number of outliers included in the data which means the model will better predict whether or not an inmate will reoffend after being released. For this reason, we will be using the log base 10 of prior count as opposed to just priorCount.
### Juvenile Priors Count
# Figure 3 components: combined juvenile priors count and its log10
# transform, grouped by the recidivism outcome.
p9 <- recidTraining %>%
jjplotDensity(x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
x = "Juvenile Priors Counts"
)
p10 <- recidTraining %>%
jjplotDensity(x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
x = "log10(Juvenile Priors Counts + 0.1)"
)
p11 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=juvCount, fill=as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
y = "Juvenile Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p12 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logJuvCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
# re-enable the legend on this panel only; patchwork collects it below
theme(legend.position = "right") +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
y = "log10(Juvenile Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p9 + p10 + p11 + p12 + plot_annotation(title = "Juvenile Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 3 shows that we can rule out prior crimes committed in juvenile years. There was not enough relevant data in these plots to include the variable or the log base 10 of the variable; therefore, this plot is important to include as a justification of refining our model later on. This is one of the few instances where we could simply rule out a variable this early in the process.
### Age
# Figure 4 components: density and boxplot of age by recidivism outcome,
# stacked vertically with patchwork's `/` operator.
p13 <- recidTraining %>%
jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Age",
x = "Age"
)
p14 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=age, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="Age",
y = "Age",
x = "Recidivated",
fill = "Recidivated"
)
p13 / p14 + plot_annotation(theme=theme_custom())
Figure 4 shows that age is a factor in whether or not a person will reoffend. Shown by the density and boxplots, we can determine that in this data, there is a higher proportion of reoffenders in younger populations and a higher proportion of non-reoffenders in older populations. The age cutoff in the trend is 35 years old. This informs us that we could potentially use age as a predictor in the model to predict recidivism.
### Sex
# Figure 5: side-by-side bar counts of the outcome within each sex.
ggplot(data=recidTraining,aes(x=sex, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Sex",
x = "Sex",
fill = "Recidivated"
) +
theme_custom()
Figure 5 takes into account sex as a factor determining reoffending rates. This plot, regarding reoffending rates between male and female populations, shows us that men reoffend at a higher rate relative to the male population than women do relative to the female population. This means that this variable would most likely be useful to the model; however, sex, along with race, will not be included for ethical reasons, as it is not ethical or fair to judge someone more harshly based on their sex or race.
### ChargeDegree
# Figure 6: side-by-side bar counts of the outcome within each charge degree.
ggplot(data=recidTraining,aes(x=chargeDegree, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Charge Degree",
x = "Charge Degree",
fill = "Recidivated"
) +
theme_custom()
Figure 6 illustrates the difference in recidivism rate based on charge degree. Despite these being ordered properly from most severe crime to least severe crime, there does not seem to be a consistent pattern by eye. This means we can leave this variable in the initial model but it may be refined out later.
### Colinearity Check
# Figure 7: pairwise scatter plots of the three candidate numeric predictors
# to check for collinearity before modeling.
# NOTE(review): "Colinearity" in the rendered titles is a nonstandard
# spelling of "collinearity" — left as-is since titles are user-facing text.
p15 <- ggplot(recidTraining, aes(x = logDaysInJail, y = logPriorsCount, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs log10(Priors Count)",
x = "log10(Days In Jail)",
y = "log10(Priors Count)",
color = "Recidivated"
) +
theme_custom()
p16 <- ggplot(recidTraining, aes(x = logDaysInJail, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs Age",
x = "log10(Days In Jail)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p17 <- ggplot(recidTraining, aes(x = logPriorsCount, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Priors Count) vs Age",
x = "log10(Priors Count)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p15 / (p16 + p17) + plot_annotation(title = "Colinearity Check", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 7 shows us that none of the three variables (age, logDaysInJail, and logPriorsCount) are colinear. The three subplots in conjunction show us that each of these three variables is not colinear with any of the others, and, therefore, all of them may be included in the model to potentially be refined out later.
## Everything Model
# Baseline logistic model using every candidate predictor (raw and logged).
everythingModel <- glm(isRecid ~ age + priorsCount + daysInJail + logPriorsCount + logDaysInJail,data=recidTraining, family = binomial)
# Classify at a 0.5 probability cutoff and tabulate actual vs predicted
# (rows = actual isRecid, columns = predicted label).
everythingPredictTrain <- getPredict(recidTraining, everythingModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTrain <- table(everythingPredictTrain$isRecid,everythingPredictTrain$prediction)
everythingPredictTest <- getPredict(recidTesting, everythingModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTest <- table(everythingPredictTest$isRecid,everythingPredictTest$prediction)
# diagnostics kept for interactive use, silenced in the rendered document
#checkModel(recidTraining,everythingMatrixTrain)
#checkModel(recidTesting,everythingMatrixTest)
## Basic Model
# Raw (unlogged) predictors plus sex.
baseModel <- glm(isRecid ~ sex + age + priorsCount + daysInJail,data=recidTraining, family = binomial)
basePredictTrain <- getPredict(recidTraining, baseModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTrain <- table(basePredictTrain$isRecid,basePredictTrain$prediction)
basePredictTest <- getPredict(recidTesting, baseModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTest <- table(basePredictTest$isRecid,basePredictTest$prediction)
#checkModel(recidTraining,baseMatrixTrain)
#checkModel(recidTesting,baseMatrixTest)
## Log Model
# Same as Basic but with the log10-transformed priors and jail-days.
logModel <- glm(isRecid ~ sex + age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)
logPredictTrain <- getPredict(recidTraining, logModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTrain <- table(logPredictTrain$isRecid,logPredictTrain$prediction)
logPredictTest <- getPredict(recidTesting, logModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTest <- table(logPredictTest$isRecid,logPredictTest$prediction)
#checkModel(recidTraining,logMatrixTrain)
#checkModel(recidTesting,logMatrixTest)
## Recid Model
# Refinement sequence: starting from the Log Model, drop or swap one
# predictor at a time and compare confusion matrices on train/test.
recidModel <- glm(isRecid ~ sex + age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)
recidPredictTrain <- getPredict(recidTraining, recidModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTrain <- table(recidPredictTrain$isRecid,recidPredictTrain$prediction)
recidPredictTest <- getPredict(recidTesting, recidModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTest <- table(recidPredictTest$isRecid,recidPredictTest$prediction)
#checkModel(recidTraining,recidMatrixTrain)
#checkModel(recidTesting,recidMatrixTest)
## Recid 2 Model
# Drops sex (ethical exclusion discussed in the prose).
recid2Model <- glm(isRecid ~ age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)
recid2PredictTrain <- getPredict(recidTraining, recid2Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTrain <- table(recid2PredictTrain$isRecid,recid2PredictTrain$prediction)
recid2PredictTest <- getPredict(recidTesting, recid2Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTest <- table(recid2PredictTest$isRecid,recid2PredictTest$prediction)
#checkModel(recidTraining,recid2MatrixTrain)
#checkModel(recidTesting,recid2MatrixTest)
## Recid 3 Model
# Minimal two-predictor specification.
recid3Model <- glm(isRecid ~ age + logPriorsCount, data=recidTraining, family = binomial)
recid3PredictTrain <- getPredict(recidTraining, recid3Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTrain <- table(recid3PredictTrain$isRecid,recid3PredictTrain$prediction)
recid3PredictTest <- getPredict(recidTesting, recid3Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTest <- table(recid3PredictTest$isRecid,recid3PredictTest$prediction)
#checkModel(recidTraining,recid3MatrixTrain)
#checkModel(recidTesting,recid3MatrixTest)
## Recid 4 Model
# Recid 3 plus sex, for comparison.
recid4Model <- glm(isRecid ~ age + sex + logPriorsCount, data=recidTraining, family = binomial)
recid4PredictTrain <- getPredict(recidTraining, recid4Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTrain <- table(recid4PredictTrain$isRecid,recid4PredictTrain$prediction)
recid4PredictTest <- getPredict(recidTesting, recid4Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTest <- table(recid4PredictTest$isRecid,recid4PredictTest$prediction)
#checkModel(recidTraining,recid4MatrixTrain)
#checkModel(recidTesting,recid4MatrixTest)
# NOTE(review): recidMysteryBox duplicates the recidMystery read at the top
# of the file and does not appear to be used anywhere below — candidate for
# removal.
recidMysteryBox <- read.csv("datasets/Project3Mystery100.csv")
## Recid 5 Model
# Two raw predictors only (unlogged priors).
recid5Model <- glm(isRecid ~ age + priorsCount, data=recidTraining, family = binomial)
recid5PredictTrain <- getPredict(recidTraining, recid5Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTrain <- table(recid5PredictTrain$isRecid,recid5PredictTrain$prediction)
recid5PredictTest <- getPredict(recidTesting, recid5Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTest <- table(recid5PredictTest$isRecid,recid5PredictTest$prediction)
#checkModel(recidTraining,recid5MatrixTrain)
#checkModel(recidTesting,recid5MatrixTest)
## Recid 6 Model
# The specification adopted as the final non-proprietary model below.
recid6Model <- glm(isRecid ~ age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)
recid6PredictTrain <- getPredict(recidTraining, recid6Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTrain <- table(recid6PredictTrain$isRecid,recid6PredictTrain$prediction)
recid6PredictTest <- getPredict(recidTesting, recid6Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTest <- table(recid6PredictTest$isRecid,recid6PredictTest$prediction)
# Stored for use in inline markdown: error rates and p-values for the
# recid6 model on training and testing data.
trainList6 <- checkModel(recidTraining,recid6MatrixTrain)
trainError <- trainList6$error
trainPVal <- trainList6$pValue
testList6 <- checkModel(recidTesting,recid6MatrixTest)
testError <- testList6$error
testPVal <- testList6$pValue
# Use <- for top-level assignment (these previously used `=`). Note these
# names shadow base::matrix for the remainder of the script.
matrix <- recid6MatrixTest
matrix1 <- recid6MatrixTrain
# hand-computed error rates (off-diagonal counts / n) as a sanity check
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)
coeffs <- recid6Model$coefficients
# null-hypothesis proportion: observed recidivism rate in the training data
nullHyp <- (matrix1[2,1]+matrix1[2,2])/nrow(recidTraining)
# prose fragments selected by the test p-value for the write-up
canWe <- ifelse(testPVal > 0.05, "can", "cannot")
continueAnalysis <- ifelse(testPVal > 0.05, "should continue analyzing this data and other related data to create a statistically significant model", "should proceed to propose a new hypothesis which we may test in future analysis")
## Final Model
# Adopt the recid6 specification (age + logPriorsCount + logDaysInJail)
# as the final model that excludes the proprietary scores.
finalModel <- recid6Model
# explain equation, training error, training confusion matrix, explain confusion matrix, same for testing, p-value, null hypothesis
The model we made from this data to predict whether or not an inmate will reoffend without the use of the proprietary recidivation and violence scores included in the data is as follows:
\[willReoffend = \lfloor 0.596374 - 0.0367996 \cdot age + \]
\[0.8183817 \cdot log_{10}(priorsCount) + 0.0302903 \cdot log_{10}(daysInJail) \rceil\]
The model equation above has a mixed pairing of one floor and one ceiling symbol, this is intended to communicate proper rounding.
In this model we began with every relevant variable (excluding the proprietary scores which we will include in the next model) which could reliably build a model when testing-training splits were made at random (this excluded the categorical variable chargeDegree due to its limited number of data points per category and chargeDesc as it was too much work to refine and had a very limited number of data points per category). We also chose to exclude race and sex due to the ethical concerns of including factors which would theoretically have no impact on the model considering equality between races and genders. From our everything model, we ruled out more factors due to their low significance and chose to include the log base 10 of certain variables as justified by the plots included in figures 1-7 where we discussed the log base 10 variants of these variables. After this, we gradually removed one variable at a time until we came to the minimal number of variables that still had a highly accurate prediction which resulted in the model shown above.
Some model coefficients listed above are a bit obscure in their effect. For example, the coefficients of the logged values indicate that the prediction (ideally between zero and one but rounded) will go up or down according to every power of ten included in the original variable. This is in contrast to the coefficient for the linear term, age, which indicates that the prediction will go down by 0.0367996 likelihood to reoffend for every year of age the inmate has.
This model, when demonstrated on the data it was trained on, has a prediction error of \(0.306875\) meaning that approximately \(30.69\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(30.69\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.
This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{705 + 277}{3156}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.306875.
This model, when demonstrated on testing data, has a prediction error of \(0.27875\) meaning that approximately \(27.88\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(27.88\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.
This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{150 + 73}{790}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.27875.
This model’s p-value regarding the testing data is \(0.001304\). When this value is below 0.05, that means the model fitted to the data is statistically significant as it has a less than \(5\%\) chance to occur randomly given random data whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; however, it was calculated with a one-tail hypothesis test comparing two proportions. The null model states that \(34.91\%\) of inmates will reoffend as this is the percentage of inmates in the data that do reoffend. The alternate hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion. The p-value acquired from this hypothesis test tells us we cannot reject the null hypothesis meaning we should proceed to propose a new hypothesis which we may test in future analysis.
This is our second model. It is a logistic regression (classification) model which predicts whether or not an inmate will reoffend within two years of being released. This time we are including in our model the black-box scores (riskRecidDecileScore and riskViolenceDecileScore) which are included in the data and were computed through use of a proprietary algorithm from a third party. The plots made for the prior model are all still relevant in this case. The following plot is an addition to the prior grouping which incorporate our new relevant predictor, riskRecidDecileScore.
### Risk Recid Score
# Figure 8: density of the proprietary risk score, split by actual outcome.
# NOTE(review): jjplotDensity() is called without its `color` argument, so
# the rug layer maps a missing {{color}} — confirm this renders as intended.
recidTraining %>%
jjplotDensity(x = riskRecidDecileScore, fill = fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
labs(
title="Risk Recidivation Score Accuracy in Predicting Recidivation",
fill = "Recidivated",
x = "Risk Recidivation Score"
) +
theme_custom()
Figure 8 compares riskRecidDecileScore with the actual recidivation value of the inmate which makes for a nice accuracy comparison. We see in this plot what we would expect: as the recidivation score goes up, the proportion of inmates who reoffend increases in relation to the inmates who do not. This holds true until a recidivation score of roughly 8.0 where both progress downward. This is likely due to less data existing at this extreme; however, it could simply indicate that the model included in the data for riskRecidDecileScore does not accurately predict recidivation in the upper values of their 10 point scale.
# Logistic (binomial) models predicting recidivism that add the proprietary
# black-box decile scores to our own predictors; each step drops one term.
riskModel <- glm(isRecid ~ age + logPriorsCount + logDaysInJail + riskRecidDecileScore + riskViolenceDecileScore, data=recidTraining, family = binomial)
#summary(riskModel)
# Refinement step: riskViolenceDecileScore removed.
riskModel2 <- glm(isRecid ~ age + logPriorsCount + logDaysInJail + riskRecidDecileScore, data=recidTraining, family = binomial)
#summary(riskModel2)
# Refinement step: logDaysInJail removed; riskModel3 is the candidate kept below.
riskModel3 <- glm(isRecid ~ age + logPriorsCount + riskRecidDecileScore, data=recidTraining, family = binomial)
#summary(riskModel3)
# Classify at a 0.5 probability threshold and build confusion matrices
# (rows = actual isRecid, columns = predicted label) for both splits.
# Bug fix: the prediction labels were misspelled ("Reaffend"); corrected
# to "Reoffend"/"Reoffended". Alphabetical column order is unchanged.
riskPredictTrain <- getPredict(recidTraining, riskModel3) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
riskMatrixTrain <- table(riskPredictTrain$isRecid, riskPredictTrain$prediction)
riskPredictTest <- getPredict(recidTesting, riskModel3) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
riskMatrixTest <- table(riskPredictTest$isRecid, riskPredictTest$prediction)
#(checkModel(recidTraining,riskMatrixTrain))
#(checkModel(recidTesting,riskMatrixTest))
#stored for use in markdown
# checkModel() (the function described as defined at the top of the markdown)
# returns the misclassification rate and the hypothesis-test p-value.
trainList <- checkModel(recidTraining,riskMatrixTrain)
trainError <- trainList$error   # training misclassification rate
trainPVal <- trainList$pValue   # training p-value
testList <- checkModel(recidTesting,riskMatrixTest)
testError <- testList$error     # testing misclassification rate
testPVal <- testList$pValue     # testing p-value
# Convenience values surfaced by inline R in the markdown text below.
# NOTE(review): `matrix` shadows base::matrix() from here on -- name kept
# for compatibility with existing inline references in the document.
matrix <- recid6MatrixTest
matrix1 <- recid6MatrixTrain
# Manual error rates: off-diagonal (misclassified) counts over sample size.
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)
coeffs <- recid6Model$coefficients
# Observed recidivism proportion in the test split (the null-hypothesis rate).
nullHyp <- (matrix[2,1]+matrix[2,2])/nrow(recidTesting)
# Bug fix: a p-value at or below 0.05 means we CAN reject the null
# hypothesis; the original condition was inverted.
canWe <- ifelse(testPVal <= 0.05, "can", "cannot")
continueAnalysis <- ifelse(testPVal > 0.05, "should continue analyzing this data and other related data to create a statistically significant model", "should proceed to propose a new hypothesis which we may test in future analysis")
## Final Model
finalModel <- riskModel3
The model we made from this data to predict whether or not an inmate will reoffend with the use of the proprietary recidivation and violence scores included in the data is as follows:
\[willReoffend = \lfloor 0.596374 + -0.0367996 \cdot age +\] \[0.8183817 \cdot logPriorsCount + 0.0302903 \cdot riskRecidDecileScore \rceil\]
The model equation above has a mixed pairing of one floor and one ceiling symbol, this is intended to communicate proper rounding.
In this model we began the variables that ultimately were relevant to our last model but now with the added proprietary score variables riskRecidDecileScore and riskViolenceDecileScore. We still chose to exclude race and sex due to the ethical concerns of including factors which would theoretically have no impact on the model considering equality between races and genders. We ended up refining out the violence score and were left with the model above. This was the best model which still included one of the proprietary scores.
Some model coefficients listed above are obscure in their effect. For example, the coefficients of the logged values indicate that the prediction will go up or down according to every power of ten included in the original variable’s value. This is in contrast to the coefficient for the included linear term, age, which indicates that the prediction will change by -0.0367996 likelihood to reoffend for every year of age the inmate has.
This model, when demonstrated on the data it was trained on, has a prediction error of \(0.3075\) meaning that approximately \(30.75\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(30.75\%\) of inmates will be predicted to reoffend when they do not or will be predicted not to reoffend when they do.
This is information which can be found in our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. The combination of these two numbers over the total number of data points gives us our error measurement. \(errorPercentage = \frac{705 + 277}{3200}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe (the training set holds \(3200\) of the \(4000\) inmates, not \(3156\) as previously stated). This ultimately results in the error value 0.306875.
This model, when demonstrated on testing data, has a prediction error of \(0.28625\) meaning that approximately \(28.62\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(28.62\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.
This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{150 + 73}{800}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe (the testing set holds \(800\) inmates, not \(790\) as previously stated). This ultimately results in the error value 0.27875.
This model’s p-value regarding the testing data is \(0.0034229\). When this value is below 0.05, that means the model fitted to the data is statistically significant as it has a less than \(5\%\) chance to occur randomly given random data whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; however, it was calculated with a one-tail hypothesis test comparing two proportions. The null model states that \(32.88\%\) of inmates will reoffend as this is the percentage of inmates in the data that do reoffend. The alternate hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion. Because this p-value is below 0.05, the hypothesis test tells us we can reject the null hypothesis, meaning we should proceed to propose a new hypothesis which we may test in future analysis.
This model was actually worse than the last model we made. This means that the proprietary scores are not good predictors of recidivism in the context of this data and it is better to disregard them and make a model which is entirely our own to predict this.
This is our third model. This model is a multiple regression model attempting to achieve parity with the algorithm which was used to calculate riskRecidDecileScore. The following illustrate some interesting relationships in this data.
## Age Cat vs riskRecidDecileScore
# Score distributions by true outcome, faceted by age category, to see
# where the proprietary score separates the groups well or poorly.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ageCat) +
  labs(
    title = "Recidivation Decile Score Distribution by Age Category",
    x = "Risk of Recidivation (Decile Score)",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 9 shows the relationship between an inmate’s age category and their riskRecidDecileScore. The plot shows that the recidivation decile score predicts the least accurately in people greater than 45 years old as the blue and red fields should cross at exactly five for perfect accuracy whereas they do not intersect at this point in the age category of 45 or older. The other columns behave properly and as expected. This will be included in our initial model to potentially be ruled out later.
## Race vs riskRecidDecileScore
# Score distributions by true outcome, faceted by race, to inspect the
# proprietary score for demographic bias.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~race) +
  labs(
    title = "Recidivation Decile Score by Race",
    x = "Risk of Recidivation (Decile Score)",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 10 shows the relationship between an inmate’s race and their riskRecidDecileScore. The plot shows that there is a heavy prediction bias for black inmates. This is a very concerning trend. Because we are trying to predict as closely as possible to the proprietary scores included, we will include race in our final model; however, this is not an ethical means of classifying or scoring individuals.
## Charge Degree vs riskRecidDecileScore
# Score distributions by true outcome, faceted by charge degree.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~chargeDegree) +
  labs(
    title = "Recidivation Decile Score by Charge Degree",
    x = "Risk of Recidivation (Decile Score)",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 11 shows the relationship between the degree of an inmate’s criminal charge and their riskRecidDecileScore. This figure shows that there are very different relationships between each charge score and the accuracy of the riskRecidDecileScore. This indicates that chargeDegree is likely an indicator used in the proprietary model and as such we will include it in our model to potentially be refined out later.
# Best-subset search over all candidate predictors of the proprietary
# recidivism decile score, followed by the hand-picked final linear model.
scoreSubsetModel <- lm(
  riskRecidDecileScore ~ logPriorsCount + priorsCount + age + chargeDegree +
    logDaysInJail + daysInJail + sex + race,
  data = recidTraining2
)
olsSubset <- ols_step_best_subset(scoreSubsetModel)
scoreFinalModel <- lm(
  riskRecidDecileScore ~ priorsCount + age + chargeDegree + logDaysInJail + race,
  data = recidTraining2
)
# Predictions on both splits feed the RMSE / correlation figures below.
scoreTrainingPredicts <- predict.lm(scoreFinalModel, newdata = recidTraining2)
scoreTestingPredicts <- predict.lm(scoreFinalModel, newdata = recidTesting2)
coeffs <- scoreFinalModel$coefficients
trainRMSE <- RMSE(scoreTrainingPredicts, recidTraining2$riskRecidDecileScore)
rmse <- RMSE(scoreTestingPredicts, recidTesting2$riskRecidDecileScore)
corr <- cor(scoreTestingPredicts, recidTesting2$riskRecidDecileScore)
The model given our training data to predict riskRecidDecileScore is as follows:
\[predictedRecidDecileScore = 5.5144564 + 0.2469088 \cdot priorsCount + -0.098403 \cdot age +\] \[0.6184054 \cdot chargeDeg_{F2} + 0.7978429 \cdot chargeDeg_{F3} + -1.2380345 \cdot chargeDeg_{F5} +\] \[0.0477814 \cdot chargeDeg_{F7} + 0.1860794 \cdot chargeDeg_{M1} + 0.3707825 \cdot chargeDeg_{M2} +\] \[1.6725884 \cdot chargeDeg_{MO3} + 0.696719 \cdot chargeDeg_{NI0} + 0.924403 \cdot logDaysInJail +\] \[0.7837351 \cdot race_{black} + -0.3393149 \cdot race_{hispanic} + -0.4842382 \cdot race_{other *non-white*}\] As can be seen, this model has some interesting components. Firstly, the last component is intended to include any race which is not black, hispanic, or white. White was not included in the model meaning that an inmate being white does not affect their recidivism prediction. This does not however mean that we can lump them in with the “other” category. Next, chargeDeg is intended to represent the chargeDegree variable. Most of these components are simply linear, meaning that being part of a specific category or having one extra increment in those variables such as chargeDegree or priorsCount respectively will directly add that value to your recidivation score. The logDaysInJail category on the other hand will add 0.924403 to your recidivation score for every factor of 10 that is included in the standard version of the variable, daysInJail.
This model started with all variables and was refined using the olsrr library to find the best subset model. After that, we constructed our final model and made our predictions to measure RMSE and correlation coefficient with.
autoplot(scoreFinalModel)[1:2] + theme_custom()
As can be seen in our diagnostic plots. This model is borderline acceptable, but not very good. The residual plot shows a \(-x^3\) pattern meaning there is likely some other factor explaining part of this data. We could not find this factor in the included data. The Q-Q plot shows that this model is fairly reliable unless we are predicting in the lower recidivation score range. From this, we can determine that this data shouldn’t be modeled in this way or is more likely missing a significant predictor to be completely accurate as this model is the best model which could be made that we found within this data.
Correlation Coefficient \(= 0.6864077\)
A correlation coefficient of \(0.6864077\) indicates a moderately strong positive relationship between the predicted and actual recidivation scores; squaring it gives \(R^2 \approx 0.4712\), meaning roughly \(47.12\%\) of the variation in recidivation score is explained by the model. The closer the correlation is to \(1\) or \(-1\), the better the model fits the data provided and, therefore, the more likely it is to be effective at predicting recidivation score. A value of 0.6864077 is not great, but would indicate that this model can predict recidivation score at the very least within an appropriate range of the actual value. We would need more effective indicators included in the inmate data in order to properly model this data with a higher correlation value.
In relation to our training data, this model has an RMSE of 2.0495382. This means that our predictions have a root mean squared error of 2.0495382 on a 10 point scale. An average of 2 points of error on the training data is quite bad. Let’s move on to the testing RMSE.
In relation to our testing data, this model has an RMSE of 2.0958573. This means that our predictions have a root mean squared error of 2.0958573 on a 10 point scale. An average of 2.1 points of error is not horrible, but it isn’t great either. The testing and training RMSE values don’t differ greatly, which indicates the model is not overfitting; the consistently high error instead suggests that predictors useful in predicting this value are missing from this data. Sadly we did not find a better model for this score which indicates that the model from the proprietary provider likely involved some information which was not included in this data.
This is our fourth model. This model is a multiple regression model attempting to achieve parity with the algorithm which was used to calculate riskViolenceDecileScore. The following plots show some meaningful relationships in the data relevant to this model.
## Age Cat vs riskViolenceDecileScore
# Violence score distributions by true outcome, faceted by age category.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ageCat) +
  labs(
    title = "Violence Score by Age Category",
    x = "Recidivation Violence Decile Score",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 12 shows different relationships between violence score and reoffence based on age range meaning that age range could be a predictor of violence score. Age category will be included in the initial model to be potentially refined out later.
## Race vs riskViolenceDecileScore
# Violence score distributions by true outcome, faceted by race.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~race) +
  labs(
    title = "Violence Score by Race",
    x = "Recidivation Violence Decile Score",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 13 shows, similarly to recidivation score, that black inmates have a different distribution of violence scores than other races meaning there could be some institutional bias included here. Again, this is unethical to include on a normal basis, but because we are trying to best predict the model someone else has already made, we are considering race a potential factor and it will be included in the initial model.
## Charge Degree vs riskViolenceDecileScore
# Violence score distributions by true outcome, faceted by charge degree.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~chargeDegree) +
  labs(
    title = "Violence Score by Charge Degree",
    x = "Recidivation Violence Decile Score",
    fill = "Recidivated"
  ) +
  theme_custom()
Figure 14 shows that charge degree has different distributions of violence scores meaning that it could be a good predictor of violence score and will be included in the initial model to be potentially refined out later.
## Best Subset
# Best-subset search for predictors of the proprietary violence score.
# Bug fix: the subset search was fit on the TESTING split (recidTesting2);
# every other model-selection step in this document uses the training
# split, so it is fit on recidTraining2 here.
violenceSubsetModel <- lm(
  riskViolenceDecileScore ~ logPriorsCount + priorsCount + age + chargeDegree +
    logDaysInJail + daysInJail + sex + race,
  data = recidTraining2
)
olsSubset <- ols_step_best_subset(violenceSubsetModel)
violenceFinalModel <- lm(
  riskViolenceDecileScore ~ priorsCount + age + logDaysInJail + race,
  data = recidTraining2
)
# Predictions on both splits feed the RMSE / correlation figures below.
violenceTrainingPredicts <- predict.lm(violenceFinalModel, newdata = recidTraining2)
violenceTestingPredicts <- predict.lm(violenceFinalModel, newdata = recidTesting2)
trainRMSE <- RMSE(violenceTrainingPredicts, recidTraining2$riskViolenceDecileScore)
rmse <- RMSE(violenceTestingPredicts, recidTesting2$riskViolenceDecileScore)
corr <- cor(violenceTestingPredicts, recidTesting2$riskViolenceDecileScore)
coeffs <- violenceFinalModel$coefficients
The model to predict riskViolenceDecileScore is as follows:
\[predictedViolenceScore = 6.7015258 + 0.1191035 \cdot priorsCount +\] \[-0.1227209 \cdot age + 0.7257646 \cdot logDaysInJail + 0.6358164 \cdot race_{black} +\] \[-0.169199 \cdot race_{hispanic} + -0.201816 \cdot race_{other *non-white*}\] Similarly to the last model, this model has some interesting components. Firstly, the last component is intended to include any race which is not black, hispanic, or white. White was not included in the model meaning that an inmate being white does not affect their recidivism prediction. This does not however mean that we can lump them in with the “other” category. Next, chargeDeg is intended to represent the chargeDegree variable. Most of these components are simply linear, meaning that being part of a specific category or having one extra increment in those variables such as race or priorsCount respectively will directly add that value to your recidivation score. The logDaysInJail category on the other hand will add 0.7257646 to your recidivation score for every factor of 10 that is included in the standard version of the variable, daysInJail.
This model started with all variables and was refined using the olsrr library to find the best subset model. After that, we constructed our final model and made our predictions to measure RMSE and correlation coefficient with.
autoplot(violenceFinalModel)[1:2] + theme_custom()
These diagnostic plots above show us that this model is not great. The residual plot shows a clear trend through all of the data’s range and the Q-Q plot only meets the guideline in the center of the plot at zero, immediately leaving the linear guide when moving left or right. We can see clearly that this model should not be modeled this way or has predictors used in the proprietary algorithm which are not included in the data.
Correlation Coefficient \(= 0.6735538\)
A correlation coefficient of \(0.6735538\) indicates a moderately strong positive relationship between the predicted and actual violence scores; squaring it gives \(R^2 \approx 0.4537\), meaning roughly \(45.37\%\) of the variation in violence score is explained by the model. The closer the correlation is to \(1\) or \(-1\), the better the model fits the data provided and, therefore, the more likely it is to be effective at predicting violence score. A value of 0.6735538 is not great, but would indicate that this model can predict violence score at the very least within an appropriate range of the actual value. We would need more effective indicators included in the inmate data in order to properly model this data with a higher correlation value.
In relation to our training data, this model has an RMSE of 1.7819866. This means that our predictions have a root mean squared error of 1.7819866 on a 10 point scale. An average of 1.8 points of error on the training data is quite bad. Let’s move on to the testing RMSE.
In relation to our testing data, this model has an RMSE of 1.8669443. This means that our predictions have a root mean squared error of 1.8669443 on a 10 point scale. An average of 1.9 points of error is not horrible, and is better than our recidivation score model, but it isn’t great either. The testing and training RMSE values don’t differ greatly, which indicates the model is not overfitting; the consistently high error instead suggests that predictors useful in predicting this value are missing from this data. Sadly we did not find a better model for this score which indicates that the model from the proprietary provider likely involved some information which was not included in this data.
Now that we have our four models, whether or not we really should be, we are going to use them to predict these modeled values in data points which do not have these values.
# replaced finalModel with recid6Model because finalModel is overwritten
# with riskModel3 above.
willRecidivate <- ifelse(predict.glm(recid6Model, newdata = recidMystery3, type = "response") > .5, 1, 0)
# Predict the two decile scores and clamp them onto the 1-10 scale.
# (Bug fix: the original clamped only the lower bound at 1, so a
# prediction above 10 would have escaped the 10-point scale.)
predictedRecidScore <- pmin(10, pmax(1, round(predict.lm(scoreFinalModel, newdata = recidMystery3), 0)))
predictedViolenceScore <- pmin(10, pmax(1, round(predict.lm(violenceFinalModel, newdata = recidMystery3), 0)))
predictedDataframe <- cbind(recidMystery, willRecidivate, predictedRecidScore, predictedViolenceScore) %>%
  select(personID, willRecidivate, predictedRecidScore, predictedViolenceScore)
# Dark-themed striped table with hover highlighting.
kable(predictedDataframe) %>%
  kable_material_dark(c("striped","hover"))
| personID | willRecidivate | predictedRecidScore | predictedViolenceScore |
|---|---|---|---|
| 1 | 0 | 2 | 1 |
| 2 | 0 | 1 | 1 |
| 3 | 0 | 6 | 5 |
| 4 | 0 | 3 | 3 |
| 5 | 0 | 6 | 5 |
| 6 | 0 | 3 | 3 |
| 7 | 0 | 5 | 4 |
| 8 | 0 | 3 | 3 |
| 9 | 0 | 5 | 5 |
| 10 | 1 | 5 | 4 |
| 11 | 0 | 6 | 5 |
| 12 | 1 | 8 | 6 |
| 13 | 0 | 4 | 3 |
| 14 | 1 | 5 | 5 |
| 15 | 0 | 6 | 5 |
| 16 | 0 | 4 | 5 |
| 17 | 0 | 3 | 2 |
| 18 | 0 | 2 | 2 |
| 19 | 0 | 3 | 3 |
| 20 | 0 | 3 | 3 |
| 21 | 0 | 3 | 2 |
| 22 | 0 | 1 | 1 |
| 23 | 1 | 8 | 6 |
| 24 | 1 | 7 | 5 |
| 25 | 0 | 5 | 5 |
| 26 | 1 | 5 | 5 |
| 27 | 0 | 4 | 4 |
| 28 | 0 | 4 | 3 |
| 29 | 0 | 3 | 2 |
| 30 | 0 | 6 | 4 |
| 31 | 0 | 5 | 4 |
| 32 | 0 | 5 | 4 |
| 33 | 0 | 3 | 3 |
| 34 | 0 | 5 | 4 |
| 35 | 1 | 6 | 5 |
| 36 | 0 | 3 | 3 |
| 37 | 0 | 1 | 1 |
| 38 | 0 | 2 | 2 |
| 39 | 0 | 4 | 4 |
| 40 | 0 | 3 | 3 |
| 41 | 0 | 5 | 5 |
| 42 | 0 | 2 | 1 |
| 43 | 0 | 3 | 3 |
| 44 | 0 | 5 | 5 |
| 45 | 0 | 5 | 4 |
| 46 | 0 | 1 | 1 |
| 47 | 0 | 5 | 5 |
| 48 | 0 | 5 | 5 |
| 49 | 0 | 4 | 3 |
| 50 | 0 | 5 | 5 |
| 51 | 0 | 9 | 5 |
| 52 | 0 | 3 | 3 |
| 53 | 0 | 4 | 3 |
| 54 | 0 | 5 | 4 |
| 55 | 1 | 8 | 6 |
| 56 | 0 | 3 | 3 |
| 57 | 0 | 3 | 3 |
| 58 | 1 | 7 | 7 |
| 59 | 0 | 4 | 3 |
| 60 | 0 | 4 | 4 |
| 61 | 0 | 2 | 2 |
| 62 | 0 | 5 | 5 |
| 63 | 0 | 1 | 1 |
| 64 | 0 | 2 | 1 |
| 65 | 1 | 6 | 5 |
| 66 | 0 | 5 | 5 |
| 67 | 0 | 3 | 3 |
| 68 | 1 | 7 | 4 |
| 69 | 0 | 4 | 4 |
| 70 | 0 | 2 | 1 |
| 71 | 0 | 5 | 5 |
| 72 | 0 | 5 | 4 |
| 73 | 0 | 2 | 2 |
| 74 | 0 | 5 | 4 |
| 75 | 0 | 2 | 2 |
| 76 | 0 | 5 | 5 |
| 77 | 0 | 4 | 3 |
| 78 | 1 | 6 | 6 |
| 79 | 1 | 7 | 6 |
| 80 | 0 | 4 | 3 |
| 81 | 1 | 5 | 4 |
| 82 | 1 | 6 | 5 |
| 83 | 0 | 5 | 4 |
| 84 | 1 | 6 | 5 |
| 85 | 0 | 2 | 1 |
| 86 | 0 | 5 | 5 |
| 87 | 1 | 4 | 4 |
| 88 | 0 | 4 | 4 |
| 89 | 0 | 5 | 4 |
| 90 | 0 | 4 | 4 |
| 91 | 0 | 5 | 5 |
| 92 | 0 | 4 | 4 |
| 93 | 0 | 4 | 5 |
| 94 | 0 | 1 | 1 |
| 95 | 0 | 4 | 3 |
| 96 | 0 | 3 | 2 |
| 97 | 0 | 3 | 2 |
| 98 | 1 | 7 | 4 |
| 99 | 1 | 6 | 6 |
| 100 | 1 | 7 | 6 |
And there’s an estimation of your missing data!!!
This is our classification model which predicts whether or not an inmate will reoffend given the fact that all of the inmates analyzed are male.
### DaysInJail
# Density + boxplot panels comparing daysInJail on the raw and log10
# scales, split by recidivism outcome, assembled into a 2x2 patchwork.
p25 <- maleTraining %>%
  jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Days in Jail"
  ) +
  theme_custom()
p26 <- maleTraining %>%
  jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    # Bug fix: the title string was missing its closing parenthesis.
    title = "log10(Days in Jail)",
    x = "log10(Days in Jail)"
  ) +
  theme_custom() +
  theme(legend.position = "none")
p27 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = daysInJail, fill = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Recidivated",
    y = "Days in Jail"
  ) +
  theme_custom() +
  theme(legend.position = "none")
p28 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = logDaysInJail, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  labs(
    title = "log10(Days in Jail)",
    x = "Recidivated",
    y = "log10(Days in Jail)",
    fill = "Recidivated"
  ) +
  theme_custom()
# Assemble the four panels with a shared, collected legend.
p25 + p26 + p27 + p28 +
  plot_annotation(
    title = "Days in Jail and log10(Days in Jail)",
    theme = theme_custom()
  ) + plot_layout(guides = "collect")
Figure 15 shows the relationship between daysInJail and whether or not an inmate will reoffend. The figure shows clearly that the log base 10 of daysInJail is more sensitive and effective at predicting whether or not an inmate will reoffend as shown by the lowered skewness of the plot and the lowered number of outliers on the boxplot graph. We will include logDaysInJail in our model with the potential for it to be ruled out later.
### Priors Count
# Four-panel look at priors counts (raw vs log10 scale) against male
# recidivism: densities on top, boxplots below, one collected legend.
p26 <- jjplotDensity(maleTraining, x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Priors Counts",
    x = "Priors Counts"
  ) +
  theme_custom() +
  theme(legend.position = "none")
p27 <- jjplotDensity(maleTraining, x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "log10(Priors Counts) + 0.1",
    x = "log10(Priors Counts) + 0.1"
  ) +
  theme_custom() +
  theme(legend.position = "none")
p28 <- jjplotBoxplot(maleTraining, x = isRecid, y = priorsCount, fill = as.factor(isRecid)) +
  labs(
    title = "Priors Counts",
    x = "Recidivated",
    y = "Priors Counts",
    fill = "Recidivated"
  ) +
  theme_custom()
p29 <- jjplotBoxplot(maleTraining, x = isRecid, y = logPriorsCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title = "log10(Priors Counts) + 0.1",
    x = "Recidivated",
    y = "log10(Priors Counts) + 0.1"
  ) +
  theme_custom() +
  theme(legend.position = "none")
p26 + p27 + p28 + p29 + plot_annotation(title = "Male Recidivation by Priors Counts", theme = theme_custom()) + plot_layout(guides = "collect")
Figure 16 shows the relationship between priorsCount and whether or not an inmate reoffended. It is clear by the plots that the log base 10 of priorsCount is a more sensitive and better indicator of whether or not an inmate will reoffend than the standard priorsCount. Therefore, we will include logPriorsCount in our model to be potentially refined out later.
### Juvenile Priors Count
# Four-panel look at juvenile priors counts (raw vs log10 scale) against
# male recidivism: densities on top, boxplots below.
p30 <- jjplotDensity(maleTraining, x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Juvenile Priors Counts",
    x = "Juvenile Priors Counts"
  )
p31 <- jjplotDensity(maleTraining, x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "log10(Juvenile Priors Counts) + 0.1",
    x = "log10(Juvenile Priors Counts) + 0.1"
  )
p32 <- jjplotBoxplot(maleTraining, x = isRecid, y = juvCount, fill = as.factor(isRecid)) +
  labs(
    title = "Juvenile Priors Counts",
    x = "Recidivated",
    y = "Juvenile Priors Counts",
    fill = "Recidivated"
  )
p33 <- jjplotBoxplot(maleTraining, x = isRecid, y = logJuvCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title = "log10(Juvenile Priors Counts) + 0.1",
    x = "Recidivated",
    y = "log10(Juvenile Priors Counts) + 0.1",
    fill = "Recidivated"
  )
p30 + p31 + p32 + p33 + plot_annotation(title = "Male Recidivation by Juvenile Priors Counts", theme = theme_custom()) + plot_layout(guides = "collect")
Figure 17 shows us that juvenilePriorCounts is not an effective predictor of recidivation in males as there is an extreme amount of left skew indicating that there is not a lot of variation in data for this category which makes this category rather ineffective to predict anything for this model.
### Age
# Age vs male recidivism: density plus boxplot, stacked vertically.
p34 <- maleTraining %>%
  jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Age",
    # Bug fix: the x-axis label was copy-pasted from the juvenile priors
    # plot; this panel plots age.
    x = "Age"
  )
p35 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = age, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title = "Age",
    x = "Recidivated",
    y = "Age",
    fill = "Recidivated"
  )
# Stack the two panels under a shared annotation title.
p34 / p35 + plot_annotation(title = "Male Recidivation by Age", theme = theme_custom())
Figure 18 shows us that age appears to be a factor in whether or not male inmates reoffend. This factor will be included in our initial males model to potentially be refined out later.
### Risk Recid Score
# Proprietary recidivism score vs actual outcome, males only.
jjplotDensity(
  maleTraining,
  x = riskRecidDecileScore,
  fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0"),
  color = as.factor(isRecid)
) +
  labs(
    title = "Male Recidivation by Risk Recidivation Score",
    x = "Risk Recidivation Decile Score",
    fill = "Recidivated"
  )
Figure 19 shows us that riskRecidivationDecileScore (RRDS) is a fairly good predictor for male inmates; however, like earlier in the model covering inmates of both sexes, this falls off around RRDS 8. This will be included as an initial predictor to potentially be refined out later.
### Colinearity Check
# Pairwise scatter plots of the three candidate numeric predictors,
# colored by outcome, to check for collinearity between them.
p36 <- maleTraining %>%
  ggplot(aes(x = logDaysInJail, y = logPriorsCount, color = fct_recode(as.factor(isRecid), Yes = "1", No = "0"))) +
  geom_point() +
  theme_custom() +
  labs(
    title = "log10(Days In Jail) vs log10(Priors Count)",
    x = "log10(Days In Jail)",
    y = "log10(Priors Count)",
    color = "Recidivated"
  )
p37 <- maleTraining %>%
  ggplot(aes(x = logDaysInJail, y = age, color = fct_recode(as.factor(isRecid), Yes = "1", No = "0"))) +
  geom_point() +
  theme_custom() +
  labs(
    title = "log10(Days In Jail) vs Age",
    x = "log10(Days In Jail)",
    y = "Age",
    color = "Recidivated"
  )
p38 <- maleTraining %>%
  ggplot(aes(x = logPriorsCount, y = age, color = fct_recode(as.factor(isRecid), Yes = "1", No = "0"))) +
  geom_point() +
  theme_custom() +
  labs(
    title = "log10(Priors Count) vs Age",
    x = "log10(Priors Count)",
    y = "Age",
    color = "Recidivated"
  )
# One plot on top, two below, sharing a single collected legend.
p36 / (p37 + p38) + plot_annotation(title = "Colinearity Check", theme = theme_custom()) + plot_layout(guides = "collect")
This is our classification model which predicts whether or not an inmate will reoffend given the fact that all of the inmates analyzed are female.
Story exists, no write-up yet. # Ethical Implications of Classification Models in the Criminal Justice System Not drafted yet, will have a draft and have someone edit it. # Conclusion Not yet… # References Will get there.